# Read the Istanbul listings file (path dated 2020-06-28) from the Inside
# Airbnb URL into `listings` using vroom().
listings <- vroom("http://data.insideairbnb.com/turkey/marmara/istanbul/2020-06-28/data/listings.csv.gz")## Rows: 23,728
## Columns: 106
## $ id <dbl> 4826, 20815, 27271, 2827…
## $ listing_url <chr> "https://www.airbnb.com/…
## $ scrape_id <dbl> 2.02e+13, 2.02e+13, 2.02…
## $ last_scraped <date> 2020-06-28, 2020-06-29,…
## $ name <chr> "The Place", "The Bospho…
## $ summary <chr> "My place is close to gr…
## $ space <chr> "A double bed apartment …
## $ description <chr> "My place is close to gr…
## $ experiences_offered <chr> "none", "none", "none", …
## $ neighborhood_overview <chr> NA, "The lovely neighbor…
## $ notes <chr> NA, "The house may be su…
## $ transit <chr> NA, "The city center, Ta…
## $ access <chr> NA, "Our dear guests may…
## $ interaction <chr> NA, "Depending on our ti…
## $ house_rules <chr> NA, "- Windows facing th…
## $ thumbnail_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ medium_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ picture_url <chr> "https://a0.muscache.com…
## $ xl_picture_url <lgl> NA, NA, NA, NA, NA, NA, …
## $ host_id <dbl> 6603, 78838, 117026, 121…
## $ host_url <chr> "https://www.airbnb.com/…
## $ host_name <chr> "Kaan", "Gülder", "Mutlu…
## $ host_since <date> 2009-01-14, 2010-02-08,…
## $ host_location <chr> "Istanbul, Istanbul, Tur…
## $ host_about <chr> "Hello...\r\nI am Kaan a…
## $ host_response_time <chr> "N/A", "N/A", "N/A", "N/…
## $ host_response_rate <chr> "N/A", "N/A", "N/A", "N/…
## $ host_acceptance_rate <chr> "N/A", "N/A", "50%", "10…
## $ host_is_superhost <lgl> FALSE, FALSE, FALSE, FAL…
## $ host_thumbnail_url <chr> "https://a0.muscache.com…
## $ host_picture_url <chr> "https://a0.muscache.com…
## $ host_neighbourhood <chr> "Üsküdar", "Beşiktaş", "…
## $ host_listings_count <dbl> 1, 2, 1, 20, 1, 1, 1, 2,…
## $ host_total_listings_count <dbl> 1, 2, 1, 20, 1, 1, 1, 2,…
## $ host_verifications <chr> "['email', 'phone', 'fac…
## $ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_identity_verified <lgl> FALSE, FALSE, TRUE, FALS…
## $ street <chr> "Istanbul Province, Ista…
## $ neighbourhood <chr> "Üsküdar", "Beşiktaş", "…
## $ neighbourhood_cleansed <chr> "Uskudar", "Besiktas", "…
## $ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA, …
## $ city <chr> "Istanbul Province", "Is…
## $ state <chr> "Istanbul", NA, NA, NA, …
## $ zipcode <chr> "34684", "34345", "34433…
## $ market <chr> "Istanbul", "Istanbul", …
## $ smart_location <chr> "Istanbul Province, Turk…
## $ country_code <chr> "TR", "TR", "TR", "TR", …
## $ country <chr> "Turkey", "Turkey", "Tur…
## $ latitude <dbl> 41.1, 41.1, 41.0, 41.0, …
## $ longitude <dbl> 29.1, 29.0, 29.0, 29.0, …
## $ is_location_exact <lgl> FALSE, TRUE, FALSE, TRUE…
## $ property_type <chr> "Apartment", "Apartment"…
## $ room_type <chr> "Entire home/apt", "Enti…
## $ accommodates <dbl> 2, 3, 2, 5, 2, 3, 2, 2, …
## $ bathrooms <dbl> 1.0, 1.0, 1.0, 1.0, 1.0,…
## $ bedrooms <dbl> 0, 2, 1, 1, 2, 1, 1, 1, …
## $ beds <dbl> 1, 2, 1, 3, 2, 1, 1, 1, …
## $ bed_type <chr> "Real Bed", "Real Bed", …
## $ amenities <chr> "{TV,\"Cable TV\",Intern…
## $ square_feet <dbl> 700, NA, NA, 753, 700, 0…
## $ price <chr> "$720.00", "$816.00", "$…
## $ weekly_price <chr> NA, "$1,556.00", "$1,769…
## $ monthly_price <chr> NA, "$5,327.00", "$6,307…
## $ security_deposit <chr> NA, "$679.00", "$769.00"…
## $ cleaning_fee <chr> NA, NA, "$308.00", "$77.…
## $ guests_included <dbl> 2, 4, 2, 2, 6, 1, 1, 2, …
## $ extra_people <chr> "$178.00", "$240.00", "$…
## $ minimum_nights <dbl> 1, 365, 30, 3, 3, 3, 1, …
## $ maximum_nights <dbl> 730, 900, 90, 360, 60, 1…
## $ minimum_minimum_nights <dbl> 1, 365, 30, 3, 3, 3, 1, …
## $ maximum_minimum_nights <dbl> 1, 365, 30, 3, 3, 3, 1, …
## $ minimum_maximum_nights <dbl> 730, 900, 90, 360, 60, 1…
## $ maximum_maximum_nights <dbl> 730, 900, 90, 360, 60, 1…
## $ minimum_nights_avg_ntm <dbl> 1, 365, 30, 3, 3, 3, 1, …
## $ maximum_nights_avg_ntm <dbl> 730, 900, 90, 360, 60, 1…
## $ calendar_updated <chr> "38 months ago", "7 mont…
## $ has_availability <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ availability_30 <dbl> 30, 13, 28, 30, 28, 29, …
## $ availability_60 <dbl> 60, 26, 58, 60, 58, 59, …
## $ availability_90 <dbl> 90, 36, 80, 90, 88, 89, …
## $ availability_365 <dbl> 365, 279, 289, 365, 88, …
## $ calendar_last_scraped <date> 2020-06-28, 2020-06-29,…
## $ number_of_reviews <dbl> 1, 41, 13, 0, 0, 0, 1, 1…
## $ number_of_reviews_ltm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ first_review <date> 2009-06-01, 2010-03-24,…
## $ last_review <date> 2009-06-01, 2018-11-07,…
## $ review_scores_rating <dbl> 100, 90, 98, NA, NA, NA,…
## $ review_scores_accuracy <dbl> NA, 9, 10, NA, NA, NA, N…
## $ review_scores_cleanliness <dbl> NA, 9, 9, NA, NA, NA, NA…
## $ review_scores_checkin <dbl> NA, 10, 10, NA, NA, NA, …
## $ review_scores_communication <dbl> NA, 10, 10, NA, NA, NA, …
## $ review_scores_location <dbl> NA, 10, 10, NA, NA, NA, …
## $ review_scores_value <dbl> NA, 9, 10, NA, NA, NA, N…
## $ requires_license <lgl> FALSE, FALSE, FALSE, FAL…
## $ license <lgl> NA, NA, NA, NA, NA, NA, …
## $ jurisdiction_names <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable <lgl> FALSE, FALSE, FALSE, TRU…
## $ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy <chr> "flexible", "moderate", …
## $ require_guest_profile_picture <lgl> FALSE, TRUE, FALSE, FALS…
## $ require_guest_phone_verification <lgl> FALSE, FALSE, FALSE, FAL…
## $ calculated_host_listings_count <dbl> 1, 2, 1, 19, 1, 1, 1, 2,…
## $ calculated_host_listings_count_entire_homes <dbl> 1, 1, 1, 6, 1, 0, 0, 1, …
## $ calculated_host_listings_count_private_rooms <dbl> 0, 1, 0, 0, 0, 1, 1, 1, …
## $ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month <dbl> 0.01, 0.33, 0.19, NA, NA…
| Name | listings |
| Number of rows | 23728 |
| Number of columns | 106 |
| _______________________ | |
| Column type frequency: | |
| character | 46 |
| Date | 5 |
| logical | 16 |
| numeric | 39 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| listing_url | 0 | 1.00 | 33 | 37 | 0 | 23728 | 0 |
| name | 54 | 1.00 | 1 | 108 | 0 | 22685 | 0 |
| summary | 3779 | 0.84 | 1 | 1000 | 0 | 17202 | 1 |
| space | 11361 | 0.52 | 1 | 1000 | 0 | 10575 | 0 |
| description | 2876 | 0.88 | 1 | 1000 | 0 | 18952 | 0 |
| experiences_offered | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| neighborhood_overview | 12658 | 0.47 | 1 | 1000 | 0 | 8867 | 0 |
| notes | 18474 | 0.22 | 1 | 1000 | 0 | 4221 | 0 |
| transit | 13660 | 0.42 | 1 | 1000 | 0 | 8147 | 0 |
| access | 16337 | 0.31 | 1 | 1000 | 0 | 5943 | 0 |
| interaction | 15019 | 0.37 | 1 | 1000 | 0 | 6607 | 0 |
| house_rules | 16407 | 0.31 | 1 | 1000 | 0 | 6306 | 0 |
| picture_url | 0 | 1.00 | 80 | 146 | 0 | 22996 | 0 |
| host_url | 0 | 1.00 | 38 | 43 | 0 | 14450 | 0 |
| host_name | 1 | 1.00 | 1 | 35 | 0 | 4907 | 0 |
| host_location | 83 | 1.00 | 2 | 105 | 0 | 775 | 0 |
| host_about | 11902 | 0.50 | 1 | 5717 | 0 | 6144 | 10 |
| host_response_time | 1 | 1.00 | 3 | 18 | 0 | 5 | 0 |
| host_response_rate | 1 | 1.00 | 2 | 4 | 0 | 60 | 0 |
| host_acceptance_rate | 1 | 1.00 | 2 | 4 | 0 | 85 | 0 |
| host_thumbnail_url | 1 | 1.00 | 55 | 106 | 0 | 14358 | 0 |
| host_picture_url | 1 | 1.00 | 57 | 109 | 0 | 14358 | 0 |
| host_neighbourhood | 15027 | 0.37 | 4 | 33 | 0 | 59 | 0 |
| host_verifications | 0 | 1.00 | 2 | 158 | 0 | 277 | 0 |
| street | 0 | 1.00 | 10 | 116 | 0 | 1180 | 0 |
| neighbourhood | 5377 | 0.77 | 4 | 15 | 0 | 15 | 0 |
| neighbourhood_cleansed | 0 | 1.00 | 4 | 13 | 0 | 39 | 0 |
| city | 773 | 0.97 | 2 | 69 | 0 | 641 | 0 |
| state | 397 | 0.98 | 1 | 58 | 0 | 293 | 0 |
| zipcode | 2422 | 0.90 | 1 | 43 | 0 | 388 | 0 |
| market | 0 | 1.00 | 7 | 21 | 0 | 3 | 0 |
| smart_location | 0 | 1.00 | 6 | 77 | 0 | 706 | 0 |
| country_code | 0 | 1.00 | 2 | 2 | 0 | 3 | 0 |
| country | 0 | 1.00 | 6 | 12 | 0 | 3 | 0 |
| property_type | 0 | 1.00 | 3 | 22 | 0 | 43 | 0 |
| room_type | 0 | 1.00 | 10 | 15 | 0 | 4 | 0 |
| bed_type | 0 | 1.00 | 5 | 13 | 0 | 5 | 0 |
| amenities | 0 | 1.00 | 2 | 1297 | 0 | 21255 | 0 |
| price | 0 | 1.00 | 5 | 10 | 0 | 501 | 0 |
| weekly_price | 21993 | 0.07 | 6 | 10 | 0 | 581 | 0 |
| monthly_price | 22031 | 0.07 | 7 | 11 | 0 | 628 | 0 |
| security_deposit | 15623 | 0.34 | 5 | 10 | 0 | 357 | 0 |
| cleaning_fee | 13660 | 0.42 | 5 | 9 | 0 | 294 | 0 |
| extra_people | 0 | 1.00 | 5 | 9 | 0 | 215 | 0 |
| calendar_updated | 0 | 1.00 | 5 | 14 | 0 | 104 | 0 |
| cancellation_policy | 0 | 1.00 | 6 | 27 | 0 | 6 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| last_scraped | 0 | 1.00 | 2020-06-28 | 2020-06-30 | 2020-06-29 | 3 |
| host_since | 1 | 1.00 | 2009-01-14 | 2020-06-27 | 2017-08-27 | 3095 |
| calendar_last_scraped | 0 | 1.00 | 2020-06-28 | 2020-06-30 | 2020-06-29 | 3 |
| first_review | 12375 | 0.48 | 2009-06-01 | 2020-06-29 | 2019-04-12 | 2216 |
| last_review | 12375 | 0.48 | 2009-06-01 | 2020-06-29 | 2020-01-01 | 1424 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| thumbnail_url | 23728 | 0 | NaN | : |
| medium_url | 23728 | 0 | NaN | : |
| xl_picture_url | 23728 | 0 | NaN | : |
| host_is_superhost | 1 | 1 | 0.11 | FAL: 21147, TRU: 2580 |
| host_has_profile_pic | 1 | 1 | 1.00 | TRU: 23625, FAL: 102 |
| host_identity_verified | 1 | 1 | 0.16 | FAL: 19951, TRU: 3776 |
| neighbourhood_group_cleansed | 23728 | 0 | NaN | : |
| is_location_exact | 0 | 1 | 0.35 | FAL: 15329, TRU: 8399 |
| has_availability | 0 | 1 | 1.00 | TRU: 23728 |
| requires_license | 0 | 1 | 0.00 | FAL: 23728 |
| license | 23728 | 0 | NaN | : |
| jurisdiction_names | 23728 | 0 | NaN | : |
| instant_bookable | 0 | 1 | 0.60 | TRU: 14249, FAL: 9479 |
| is_business_travel_ready | 0 | 1 | 0.00 | FAL: 23728 |
| require_guest_profile_picture | 0 | 1 | 0.01 | FAL: 23571, TRU: 157 |
| require_guest_phone_verification | 0 | 1 | 0.01 | FAL: 23526, TRU: 202 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 2.91e+07 | 1.31e+07 | 4.83e+03 | 2.10e+07 | 3.40e+07 | 3.97e+07 | 4.40e+07 | ▂▂▂▃▇ |
| scrape_id | 0 | 1.00 | 2.02e+13 | 0.00e+00 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | 2.02e+13 | ▁▁▇▁▁ |
| host_id | 0 | 1.00 | 1.49e+08 | 1.16e+08 | 6.60e+03 | 3.29e+07 | 1.48e+08 | 2.59e+08 | 3.52e+08 | ▇▂▃▅▃ |
| host_listings_count | 1 | 1.00 | 2.43e+01 | 2.24e+02 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 4.00e+00 | 3.77e+03 | ▇▁▁▁▁ |
| host_total_listings_count | 1 | 1.00 | 2.43e+01 | 2.24e+02 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 4.00e+00 | 3.77e+03 | ▇▁▁▁▁ |
| latitude | 0 | 1.00 | 4.10e+01 | 5.00e-02 | 4.08e+01 | 4.10e+01 | 4.10e+01 | 4.10e+01 | 4.15e+01 | ▁▇▁▁▁ |
| longitude | 0 | 1.00 | 2.90e+01 | 1.30e-01 | 2.80e+01 | 2.90e+01 | 2.90e+01 | 2.90e+01 | 2.99e+01 | ▁▁▇▁▁ |
| accommodates | 0 | 1.00 | 3.21e+00 | 2.25e+00 | 1.00e+00 | 2.00e+00 | 2.00e+00 | 4.00e+00 | 1.60e+01 | ▇▁▁▁▁ |
| bathrooms | 86 | 1.00 | 1.21e+00 | 1.04e+00 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 5.00e+01 | ▇▁▁▁▁ |
| bedrooms | 173 | 0.99 | 1.39e+00 | 1.44e+00 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 5.00e+01 | ▇▁▁▁▁ |
| beds | 698 | 0.97 | 2.05e+00 | 2.04e+00 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 7.70e+01 | ▇▁▁▁▁ |
| square_feet | 23492 | 0.01 | 6.05e+02 | 1.21e+03 | 0.00e+00 | 7.00e+01 | 5.38e+02 | 8.07e+02 | 1.74e+04 | ▇▁▁▁▁ |
| guests_included | 0 | 1.00 | 1.40e+00 | 1.09e+00 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 1.60e+01 | ▇▁▁▁▁ |
| minimum_nights | 0 | 1.00 | 4.53e+00 | 2.76e+01 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 3.00e+00 | 1.12e+03 | ▇▁▁▁▁ |
| maximum_nights | 0 | 1.00 | 9.13e+04 | 1.39e+07 | 1.00e+00 | 6.00e+01 | 1.12e+03 | 1.12e+03 | 2.15e+09 | ▇▁▁▁▁ |
| minimum_minimum_nights | 0 | 1.00 | 4.41e+00 | 2.68e+01 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 1.12e+03 | ▇▁▁▁▁ |
| maximum_minimum_nights | 0 | 1.00 | 4.71e+00 | 2.86e+01 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 3.00e+00 | 1.12e+03 | ▇▁▁▁▁ |
| minimum_maximum_nights | 0 | 1.00 | 9.28e+02 | 9.68e+03 | 1.00e+00 | 3.60e+02 | 1.12e+03 | 1.12e+03 | 1.00e+06 | ▇▁▁▁▁ |
| maximum_maximum_nights | 0 | 1.00 | 9.29e+02 | 9.68e+03 | 1.00e+00 | 3.60e+02 | 1.12e+03 | 1.12e+03 | 1.00e+06 | ▇▁▁▁▁ |
| minimum_nights_avg_ntm | 0 | 1.00 | 4.51e+00 | 2.70e+01 | 1.00e+00 | 1.00e+00 | 1.00e+00 | 3.00e+00 | 1.12e+03 | ▇▁▁▁▁ |
| maximum_nights_avg_ntm | 0 | 1.00 | 9.29e+02 | 9.68e+03 | 1.00e+00 | 3.60e+02 | 1.12e+03 | 1.12e+03 | 1.00e+06 | ▇▁▁▁▁ |
| availability_30 | 0 | 1.00 | 2.21e+01 | 1.21e+01 | 0.00e+00 | 1.70e+01 | 2.90e+01 | 3.00e+01 | 3.00e+01 | ▂▁▁▁▇ |
| availability_60 | 0 | 1.00 | 4.53e+01 | 2.38e+01 | 0.00e+00 | 4.10e+01 | 5.90e+01 | 6.00e+01 | 6.00e+01 | ▂▁▁▁▇ |
| availability_90 | 0 | 1.00 | 6.90e+01 | 3.51e+01 | 0.00e+00 | 6.60e+01 | 8.90e+01 | 9.00e+01 | 9.00e+01 | ▂▁▁▁▇ |
| availability_365 | 0 | 1.00 | 2.28e+02 | 1.47e+02 | 0.00e+00 | 8.90e+01 | 3.02e+02 | 3.65e+02 | 3.65e+02 | ▃▂▂▁▇ |
| number_of_reviews | 0 | 1.00 | 7.87e+00 | 2.32e+01 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 4.00e+00 | 3.45e+02 | ▇▁▁▁▁ |
| number_of_reviews_ltm | 0 | 1.00 | 3.04e+00 | 7.47e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 2.00e+00 | 1.08e+02 | ▇▁▁▁▁ |
| review_scores_rating | 12978 | 0.45 | 9.13e+01 | 1.40e+01 | 2.00e+01 | 9.00e+01 | 9.60e+01 | 1.00e+02 | 1.00e+02 | ▁▁▁▁▇ |
| review_scores_accuracy | 12991 | 0.45 | 9.29e+00 | 1.42e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_cleanliness | 12988 | 0.45 | 9.06e+00 | 1.51e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▂▇ |
| review_scores_checkin | 12991 | 0.45 | 9.52e+00 | 1.28e+00 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_communication | 12987 | 0.45 | 9.55e+00 | 1.24e+00 | 2.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_location | 12991 | 0.45 | 9.44e+00 | 1.23e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| review_scores_value | 12993 | 0.45 | 9.19e+00 | 1.40e+00 | 2.00e+00 | 9.00e+00 | 1.00e+01 | 1.00e+01 | 1.00e+01 | ▁▁▁▁▇ |
| calculated_host_listings_count | 0 | 1.00 | 5.86e+00 | 1.65e+01 | 1.00e+00 | 1.00e+00 | 2.00e+00 | 5.00e+00 | 1.76e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_entire_homes | 0 | 1.00 | 2.81e+00 | 6.37e+00 | 0.00e+00 | 0.00e+00 | 1.00e+00 | 2.00e+00 | 6.60e+01 | ▇▁▁▁▁ |
| calculated_host_listings_count_private_rooms | 0 | 1.00 | 2.46e+00 | 1.51e+01 | 0.00e+00 | 0.00e+00 | 1.00e+00 | 1.00e+00 | 1.75e+02 | ▇▁▁▁▁ |
| calculated_host_listings_count_shared_rooms | 0 | 1.00 | 9.00e-02 | 6.00e-01 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 0.00e+00 | 1.10e+01 | ▇▁▁▁▁ |
| reviews_per_month | 12375 | 0.48 | 7.10e-01 | 9.00e-01 | 1.00e-02 | 1.30e-01 | 3.30e-01 | 9.50e-01 | 9.20e+00 | ▇▁▁▁▁ |
From our glimpse into the data frame, we see that there are 106 columns with a total of 23,728 rows. However of these 106 columns, skim() shows us that only 39 are of the type “numeric”. These include variables such as “bedrooms”, “square feet”, “latitude” and “longitude”. Investigating further, we see that a lot of the columns don’t add anything of value in terms of analysis (e.g. id, listing_url, scrape_id) and will therefore be dropped later on.
However, we also see that some of the columns you’d expect to be numeric (e.g. “price”, “cleaning fee”) are actually served as strings - we will therefore need to cast these for use in our analysis later on.
Checking for factor variables in our dataset, we first dive into the review section of hosts - surely AirBnB must have implemented a review system of 1-10?
# Bar chart of how often each "value" review score occurs.
# Listings without a score are removed with is.na(), the reliable test for
# missing values (comparing a numeric column against the string "NA" is not).
# count() tallies each score and returns an ungrouped tibble.
listings %>%
  filter(!is.na(review_scores_value)) %>%
  count(review_scores_value) %>%
  ggplot(aes(x = review_scores_value, y = n)) +
  geom_col()
Manipulating our dataframe to show the range of review scores, we see that AirBnB indeed has a 1-10 rating system, and judging from the scores given it seems that Istanbul has some great hosts! The reason why we don’t see any 1 scores could perhaps be that AirBnB removes hosts who receive such a score very quickly.
Other categorical values seem to be variables such as:

* maximum_nights (the max period you can rent an AirBnB is 365 days)
* zipcode (Istanbul has a finite number of zipcodes)
* neighbourhood (Istanbul has a finite number of neighbourhoods)
A few interesting variables were picked out of the dataset and evaluated using GGPairs:
# Subset of candidate predictors for a pairs plot.
# `price` is stored as a currency string (e.g. "$720.00") in the raw data, so
# it is parsed to a number first; left as text, ggpairs() would treat it as a
# categorical variable with hundreds of levels instead of a numeric one.
ggpairs_data <- listings %>%
  select(
    price, bedrooms, neighbourhood,
    accommodates, bathrooms, square_feet
  ) %>%
  mutate(price = parse_number(price))
# cardinality_threshold = NULL switches off GGally's check on the number of
# levels in character/factor columns (needed for `neighbourhood`).
ggpairs(ggpairs_data, cardinality_threshold = NULL)
From this plot, we see that the data is scattered all over, presuming that we can have some trouble building a model that is able to predict the price for a 4 night stay in Istanbul. We see that bathrooms and bedrooms have near linear relationship, but that is pretty much the only strong correlation we see. Getting the predicted price might end up being tricky!
As we saw in our glimpse of the data set, there are a few variables that have no relevancy, or at least cause a lot of noise when trying to manipulate the data. Let’s remove some of the less important columns:
# Keep relevant variables: drop identifier, URL, free-text and scrape-metadata
# columns. Each column is listed exactly once, grouped by theme.
listings_cleaned <- listings %>%
  select(!c(
    id, scrape_id, last_scraped,
    experiences_offered, neighborhood_overview,
    thumbnail_url, medium_url, picture_url, xl_picture_url,
    host_id, host_url, host_name, host_location, host_about,
    host_thumbnail_url, host_picture_url,
    country, country_code,
    calendar_updated, has_availability, calendar_last_scraped,
    jurisdiction_names
  ))
Now let’s turn the faulty character variables into the numerics they should be representing, starting with price
# Turn the currency strings in `price` (e.g. "$720.00") into plain numbers.
listings_cleaned <- mutate(listings_cleaned, price = parse_number(price))
# Inspect the storage type of the converted column.
typeof(listings_cleaned[["price"]])
## [1] "double"
Now, let’s turn our focus to the cleaning_fee variable
# Convert the cleaning_fee currency strings to numeric values.
listings_cleaned <- mutate(
  listings_cleaned,
  cleaning_fee = parse_number(cleaning_fee)
)
# Render a skim() summary of the converted column as a styled table.
kable_styling(kable(skim(listings_cleaned$cleaning_fee)))
| skim_type | skim_variable | n_missing | complete_rate | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| numeric | data | 13660 | 0.424 | 127 | 178 | 0 | 0 | 80 | 192 | 4569 | ▇▁▁▁▁ |
We see that there are 13660 rows / ads missing a value for cleaning_fee. This most likely indicates that the ad simply has no cleaning fee associated with renting the apartment.
Let’s turn these missing values into zeroes, which more accurately convey what the dataset is trying to tell us:
# A missing cleaning fee is interpreted as "no fee": substitute 0 for NA and
# leave every other value untouched.
listings_cleaned <- listings_cleaned %>%
  mutate(cleaning_fee = coalesce(cleaning_fee, 0))
# Confirm there are no missing values left.
kable_styling(kable(skim(listings_cleaned$cleaning_fee)))
| skim_type | skim_variable | n_missing | complete_rate | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| numeric | data | 0 | 1 | 54.1 | 132 | 0 | 0 | 0 | 60 | 4569 | ▇▁▁▁▁ |
Let’s turn our attention to property_type. First we count how many categories make up the variable’s frequency
# Number of listings per property type, most frequent first.
# count() returns an ungrouped tibble, so no grouping leaks into later steps
# that reuse property_count.
property_count <- listings_cleaned %>%
  count(property_type, sort = TRUE)
property_count %>%
  kable() %>%
  kable_styling()
| property_type | n |
|---|---|
| Apartment | 14958 |
| Serviced apartment | 1700 |
| House | 1564 |
| Boutique hotel | 1113 |
| Townhouse | 692 |
| Condominium | 629 |
| Aparthotel | 590 |
| Bed and breakfast | 585 |
| Hotel | 545 |
| Loft | 436 |
| Villa | 328 |
| Hostel | 150 |
| Other | 78 |
| Casa particular (Cuba) | 63 |
| Tiny house | 55 |
| Guesthouse | 54 |
| Guest suite | 32 |
| Earth house | 18 |
| Farm stay | 17 |
| Yurt | 17 |
| Cottage | 13 |
| Camper/RV | 11 |
| Boat | 10 |
| Castle | 10 |
| Chalet | 9 |
| Nature lodge | 8 |
| Treehouse | 7 |
| Pension (South Korea) | 5 |
| Tent | 5 |
| Bungalow | 4 |
| Lighthouse | 4 |
| Houseboat | 3 |
| Campsite | 2 |
| Dome house | 2 |
| Hut | 2 |
| Pousada (Portugal) | 2 |
| Barn | 1 |
| Cabin | 1 |
| Cave | 1 |
| Heritage hotel (India) | 1 |
| Island | 1 |
| Vacation home | 1 |
| Windmill | 1 |
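We see that the top 4 most common property types are:

* Apartment (14958 listings)
* Serviced apartment (1700 listings)
* House (1564 listings)
* Boutique hotel (1113 listings)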
These four make up…
# Total number of rows in listings_cleaned (returned by count() as column n).
totalproperty_count <- count(select(listings_cleaned, property_type))
# Combined size and share of the four most common property types.
# property_count is already ordered by descending n, so the first four rows
# are apartment, serviced apartment, house and boutique hotel.
property_count %>%
  ungroup() %>% # drop any grouping before summarising
  head(4) %>%
  arrange(desc(n)) %>%
  summarise(sum4 = sum(n)) %>% # listings in the top 4 types
  mutate(proportion4 = sum4 / totalproperty_count$n) # share of all listings
| sum4 | proportion4 |
|---|---|
| 19335 | 0.815 |
A whole 81.5% as seen from the data manipulation above. Let’s now create a simplified version of the property_type variable with 5 categories:
# Collapse property_type into five categories: the four most common types are
# kept as they are, everything else becomes "Other".
listings_cleaned <- listings_cleaned %>%
  mutate(
    prop_type_simplified = if_else(
      property_type %in% c(
        "Apartment", "Serviced apartment", "House", "Boutique hotel"
      ),
      property_type,
      "Other"
    )
  )
# Sanity check: tabulate each original type against its simplified category.
listings_cleaned %>%
  count(property_type, prop_type_simplified, sort = TRUE) %>%
  kable() %>%
  kable_styling()
| property_type | prop_type_simplified | n |
|---|---|---|
| Apartment | Apartment | 14958 |
| Serviced apartment | Serviced apartment | 1700 |
| House | House | 1564 |
| Boutique hotel | Boutique hotel | 1113 |
| Townhouse | Other | 692 |
| Condominium | Other | 629 |
| Aparthotel | Other | 590 |
| Bed and breakfast | Other | 585 |
| Hotel | Other | 545 |
| Loft | Other | 436 |
| Villa | Other | 328 |
| Hostel | Other | 150 |
| Other | Other | 78 |
| Casa particular (Cuba) | Other | 63 |
| Tiny house | Other | 55 |
| Guesthouse | Other | 54 |
| Guest suite | Other | 32 |
| Earth house | Other | 18 |
| Farm stay | Other | 17 |
| Yurt | Other | 17 |
| Cottage | Other | 13 |
| Camper/RV | Other | 11 |
| Boat | Other | 10 |
| Castle | Other | 10 |
| Chalet | Other | 9 |
| Nature lodge | Other | 8 |
| Treehouse | Other | 7 |
| Pension (South Korea) | Other | 5 |
| Tent | Other | 5 |
| Bungalow | Other | 4 |
| Lighthouse | Other | 4 |
| Houseboat | Other | 3 |
| Campsite | Other | 2 |
| Dome house | Other | 2 |
| Hut | Other | 2 |
| Pousada (Portugal) | Other | 2 |
| Barn | Other | 1 |
| Cabin | Other | 1 |
| Cave | Other | 1 |
| Heritage hotel (India) | Other | 1 |
| Island | Other | 1 |
| Vacation home | Other | 1 |
| Windmill | Other | 1 |
Let’s now turn our attention to the minimum_nights variable
# Frequency table of minimum_nights, most common value first.
listings_cleaned %>%
  count(minimum_nights, sort = TRUE) %>%
  kable() %>%
  kable_styling()
| minimum_nights | n |
|---|---|
| 1 | 13228 |
| 2 | 4511 |
| 3 | 2682 |
| 7 | 702 |
| 5 | 701 |
| 4 | 490 |
| 30 | 324 |
| 10 | 185 |
| 15 | 179 |
| 6 | 138 |
| 14 | 69 |
| 20 | 61 |
| 28 | 60 |
| 90 | 52 |
| 60 | 42 |
| 180 | 30 |
| 365 | 27 |
| 8 | 26 |
| 9 | 21 |
| 13 | 21 |
| 27 | 19 |
| 25 | 18 |
| 29 | 16 |
| 120 | 15 |
| 31 | 12 |
| 21 | 9 |
| 360 | 9 |
| 12 | 8 |
| 100 | 6 |
| 150 | 6 |
| 80 | 5 |
| 1000 | 5 |
| 200 | 4 |
| 11 | 3 |
| 17 | 3 |
| 40 | 3 |
| 45 | 3 |
| 50 | 3 |
| 19 | 2 |
| 65 | 2 |
| 88 | 2 |
| 250 | 2 |
| 500 | 2 |
| 24 | 1 |
| 26 | 1 |
| 32 | 1 |
| 35 | 1 |
| 59 | 1 |
| 61 | 1 |
| 77 | 1 |
| 85 | 1 |
| 96 | 1 |
| 118 | 1 |
| 140 | 1 |
| 148 | 1 |
| 183 | 1 |
| 210 | 1 |
| 300 | 1 |
| 600 | 1 |
| 720 | 1 |
| 730 | 1 |
| 800 | 1 |
| 900 | 1 |
| 999 | 1 |
| 1125 | 1 |
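The most common values (top 5) are:

* 1 night (13228 listings)
* 2 nights (4511 listings)
* 3 nights (2682 listings)
* 7 nights (702 listings)
* 5 nights (701 listings)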
Plotting this as a ggplot would result in a very ugly diagram. We see that using AirBnB to stay for 1 night in Istanbul seems very popular representing more than 60% of the top 5 booking durations.
There could be several reasons why this is the case, but our hypothesis is that:
We will now filter the minimum_nights so that only observations <= 4 are included in the dataset
## [1] 76922
# Cost of a 4-night stay for two people.
#  - guests_included == 1: the second guest is charged the extra_people fee
#    for each of the 4 nights on top of the base price.
#  - guests_included >= 2: both guests are covered by the base price.
# Any other value of guests_included (including NA) yields an NA cost.
# Everything is computed in a single mutate(), so rows keep their order and
# no join across every column is required.
listings_cleaned <- listings_cleaned %>%
  mutate(
    # extra_people is a currency string (e.g. "$178.00") in the raw file;
    # parse it here unless an earlier step has already made it numeric.
    extra_people = if (is.character(extra_people)) {
      parse_number(extra_people)
    } else {
      extra_people
    },
    cost = case_when(
      guests_included == 1 ~ 4 * price + 4 * extra_people + cleaning_fee,
      guests_included >= 2 ~ 4 * price + cleaning_fee
    ),
    # Same value under the name used by the plots and models.
    price_4_nights = cost
  )
# Prefer fixed over scientific notation and print 4 significant digits.
options(scipen = 100, digits = 4)
# Histogram of the 4-night cost on the raw scale; each bar counts listings.
ggplot(listings_cleaned, aes(x = price_4_nights)) +
  geom_histogram() +
  labs(
    title = "Price of staying at an Airbnb location for 4 nights",
    x = "Costs of 4-night stays",
    y = "No. of listings"
  )
# Same histogram with a log10 x-axis.
ggplot(listings_cleaned, aes(x = price_4_nights)) +
  geom_histogram() +
  scale_x_log10() +
  labs(
    title = "Price of staying at an Airbnb location for 4 nights",
    x = "Costs of 4-night stays",
    y = "No. of listings"
  )
For the regression model we would think of the following explanatory variables:
These are based on our own experience when we’re looking for places to stay. We have found that the above variables usually play a big part in determining the price of the listing.
# model1: linear regression of the 4-night cost on the number of reviews, the
# simplified property type and the overall review rating.
# lm() drops rows with missing values in any of these variables.
model1 <- lm(price_4_nights ~ number_of_reviews + prop_type_simplified + review_scores_rating, data = listings_cleaned)
# Print coefficients, standard errors and fit statistics.
summary(model1)##
## Call:
## lm(formula = price_4_nights ~ number_of_reviews + prop_type_simplified +
## review_scores_rating, data = listings_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2641 -1087 -636 81 305432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2665.804 493.831 5.40 0.000000069
## number_of_reviews -0.744 2.238 -0.33 0.740
## prop_type_simplifiedBoutique hotel 452.912 327.261 1.38 0.166
## prop_type_simplifiedHouse -21.031 322.412 -0.07 0.948
## prop_type_simplifiedOther 246.776 208.227 1.19 0.236
## prop_type_simplifiedServiced apartment 439.449 280.231 1.57 0.117
## review_scores_rating -10.772 5.261 -2.05 0.041
##
## (Intercept) ***
## number_of_reviews
## prop_type_simplifiedBoutique hotel
## prop_type_simplifiedHouse
## prop_type_simplifiedOther
## prop_type_simplifiedServiced apartment
## review_scores_rating *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7190 on 9632 degrees of freedom
## (11272 observations deleted due to missingness)
## Multiple R-squared: 0.0011, Adjusted R-squared: 0.00048
## F-statistic: 1.77 on 6 and 9632 DF, p-value: 0.101
#According to model1, a one-point increase in an Airbnb location's review score rating is associated with a decrease of 10.77 in the price of a 4-night stay for a couple, holding the other variables constant.
#The prop_type_simplified coefficients compare each property type with the reference category, Apartment (the only level without its own coefficient). According to model1, a boutique hotel costs about 452.91 more than an apartment for a 4-night stay for a couple, and a serviced apartment about 439.45 more. Houses are the only property type with a negative coefficient, costing about 21.03 less than an apartment. All other property types (grouped as "Other"), such as townhouses, condominiums, aparthotels, bed and breakfasts, and lofts, cost about 246.78 more than an apartment. None of these property-type differences is statistically significant at the 5% level.
# Model 2: Model 1 plus room_type as an additional explanatory variable.
model2 <- lm(
  price_4_nights ~ number_of_reviews + room_type + prop_type_simplified +
    review_scores_rating,
  data = listings_cleaned
)
summary(model2)##
## Call:
## lm(formula = price_4_nights ~ number_of_reviews + room_type +
## prop_type_simplified + review_scores_rating, data = listings_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3430 -944 -539 43 304411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2863.32 495.28 5.78 0.0000000076
## number_of_reviews -3.01 2.26 -1.33 0.184
## room_typeHotel room 782.09 374.01 2.09 0.037
## room_typePrivate room -879.57 165.56 -5.31 0.0000001104
## room_typeShared room -1460.05 638.07 -2.29 0.022
## prop_type_simplifiedBoutique hotel 381.81 366.80 1.04 0.298
## prop_type_simplifiedHouse 48.55 321.98 0.15 0.880
## prop_type_simplifiedOther 276.95 216.60 1.28 0.201
## prop_type_simplifiedServiced apartment 173.57 285.19 0.61 0.543
## review_scores_rating -9.34 5.26 -1.78 0.076
##
## (Intercept) ***
## number_of_reviews
## room_typeHotel room *
## room_typePrivate room ***
## room_typeShared room *
## prop_type_simplifiedBoutique hotel
## prop_type_simplifiedHouse
## prop_type_simplifiedOther
## prop_type_simplifiedServiced apartment
## review_scores_rating .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7170 on 9629 degrees of freedom
## (11272 observations deleted due to missingness)
## Multiple R-squared: 0.00567, Adjusted R-squared: 0.00474
## F-statistic: 6.1 on 9 and 9629 DF, p-value: 0.0000000137
In this part, we will further investigate the relationship between several variables to extend our analysis. First, an overview of the relationship we investigate will be given. Then, the relationship will be investigated, and finally, a written conclusion will be presented.
To start, we will investigate whether the number of bathrooms, bedrooms, beds or the size of the house are significant predictors of the price for a four night stay as it was calculated in the previous chapter.
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1373.6 81.9 16.8 <0.0000000000000002 ***
## bathrooms 545.6 51.1 10.7 <0.0000000000000002 ***
##
## Residual standard error: 7860 on 20829 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.00545, Adjusted R-squared: 0.0054
## F-statistic: 114 on 1 and 20829 DF, p-value: <0.0000000000000002
The t value of 10.7 tells us that the number of bathrooms is a significant predictor of the price for four nights. Our model estimates that every additional bathroom adds about $545.6 to the price of a four-night stay.
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1543.5 74.3 20.77 <0.0000000000000002 ***
## bedrooms 360.2 36.9 9.75 <0.0000000000000002 ***
##
## Residual standard error: 7880 on 20753 degrees of freedom
## (156 observations deleted due to missingness)
## Multiple R-squared: 0.00456, Adjusted R-squared: 0.00451
## F-statistic: 95.1 on 1 and 20753 DF, p-value: <0.0000000000000002
The t value of 9.75 tells us that the number of bedrooms is a significant predictor of the price for four nights. Our model estimates that every additional bedroom adds about $360.2 to the price of a four-night stay.
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1338.8 74.7 17.9 <0.0000000000000002 ***
## beds 315.1 25.8 12.2 <0.0000000000000002 ***
##
## Residual standard error: 7600 on 20259 degrees of freedom
## (650 observations deleted due to missingness)
## Multiple R-squared: 0.0073, Adjusted R-squared: 0.00725
## F-statistic: 149 on 1 and 20259 DF, p-value: <0.0000000000000002
The t value of 12.2 tells us that the number of beds is a significant predictor of the price for four nights. Our model estimates that every additional bed adds about $315.1 to the price of a four-night stay.
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 571.0 93.9 6.08 0.0000000012 ***
## accommodates 460.6 24.3 18.95 < 0.0000000000000002 ***
##
## Residual standard error: 7800 on 20909 degrees of freedom
## Multiple R-squared: 0.0169, Adjusted R-squared: 0.0168
## F-statistic: 359 on 1 and 20909 DF, p-value: <0.0000000000000002
The t value of 18.95 tells us that the number of people an AirBnB can accommodate is a significant predictor of the price for four nights. Our model estimates that every additional person an apartment can accommodate adds about $460.6 to the price of a four-night stay.
# Model E: all four size-related predictors entered together.
mymodele <- lm(
  price_4_nights ~ bathrooms + bedrooms + beds + accommodates,
  data = listings_cleaned
)
msummary(mymodele)## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 409.20 99.56 4.11 0.0000397 ***
## bathrooms 338.56 71.01 4.77 0.0000019 ***
## bedrooms -120.36 55.80 -2.16 0.031 *
## beds 8.28 35.41 0.23 0.815
## accommodates 412.02 31.04 13.27 < 0.0000000000000002 ***
##
## Residual standard error: 7580 on 20109 degrees of freedom
## (797 observations deleted due to missingness)
## Multiple R-squared: 0.0176, Adjusted R-squared: 0.0174
## F-statistic: 90.3 on 4 and 20109 DF, p-value: <0.0000000000000002
When we run our model with all of these variables at the same time, we find at first sight that the number of beds is no longer a significant predictor and that extra bedrooms actually decrease the price of the four night stay. It is important to note that the coefficients of a multiple regression are easiest to interpret when there is no correlation between the explanatory variables (i.e. no multicollinearity). No test is needed to see that this is not the case in our example: the number of people an apartment can accommodate and the number of bedrooms are obviously correlated with the number of beds, and so is the number of bathrooms. It is for this reason that we cannot draw conclusions from the above linear model without making appropriate adaptations.
Let's investigate whether the fact that a property has or has not listed its exact location is a significant predictor of the price of our four night stay. Using a similar strategy as above, we first looked at the available variables. We suspect we will have to control for the effect of a listing being complete in its information in general, so that we can isolate the effect of specifically having the exact location available. In the following code, we create a variable that is 0 if no summary is available and 1 if a summary is available. We do the same thing for space, description, notes, transit, access and interaction.
# Add one 0/1 indicator per free-text field: 1 when the host filled the field
# in, 0 when it is NA.
listings_cleaned_g <- listings_cleaned %>%
  mutate(
    summary_available     = as.numeric(!is.na(summary)),
    space_available       = as.numeric(!is.na(space)),
    description_available = as.numeric(!is.na(description)),
    notes_available       = as.numeric(!is.na(notes)),
    transit_available     = as.numeric(!is.na(transit)),
    access_available      = as.numeric(!is.na(access)),
    interaction_available = as.numeric(!is.na(interaction))
  )
glimpse(listings_cleaned_g) ## Rows: 20,911
## Columns: 94
## $ listing_url <chr> "https://www.airbnb.com/…
## $ name <chr> "↪ Istanbul, Your second…
## $ summary <chr> NA, NA, NA, NA, NA, "Hi!…
## $ space <chr> "There are many interest…
## $ description <chr> "There are many interest…
## $ notes <chr> NA, NA, NA, NA, "Please …
## $ transit <chr> "There are bus stops to …
## $ access <chr> NA, NA, NA, NA, "You wil…
## $ interaction <chr> NA, NA, NA, NA, "The are…
## $ house_rules <chr> "Non smoker or (Email hi…
## $ host_since <date> 2010-05-16, 2010-05-25,…
## $ host_response_time <chr> "N/A", "N/A", "N/A", "N/…
## $ host_response_rate <chr> "N/A", "N/A", "N/A", "N/…
## $ host_acceptance_rate <chr> "N/A", "N/A", "N/A", "N/…
## $ host_is_superhost <lgl> FALSE, FALSE, FALSE, FAL…
## $ host_neighbourhood <chr> "Beyoglu", "Taksim", "Ka…
## $ host_listings_count <dbl> 1, 1, 1, 1, 6, 1, 1, 1, …
## $ host_total_listings_count <dbl> 1, 1, 1, 1, 6, 1, 1, 1, …
## $ host_verifications <chr> "['email', 'phone', 'fac…
## $ host_has_profile_pic <lgl> TRUE, TRUE, FALSE, TRUE,…
## $ host_identity_verified <lgl> FALSE, TRUE, FALSE, FALS…
## $ street <chr> "Istanbul, Istanbul, Tur…
## $ neighbourhood <chr> "Beyoglu", "Beyoglu", "K…
## $ neighbourhood_cleansed <chr> "Beyoglu", "Beyoglu", "B…
## $ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA, …
## $ city <chr> "Istanbul", "Istanbul Pr…
## $ state <chr> "Istanbul", NA, NA, NA, …
## $ zipcode <chr> "34445", "34433", "34425…
## $ market <chr> "Istanbul", "Istanbul", …
## $ smart_location <chr> "Istanbul, Turkey", "Ist…
## $ latitude <dbl> 41.05, 41.03, 41.03, 41.…
## $ longitude <dbl> 28.95, 28.98, 28.98, 29.…
## $ is_location_exact <lgl> FALSE, FALSE, FALSE, FAL…
## $ property_type <chr> "Apartment", "Apartment"…
## $ room_type <chr> "Private room", "Private…
## $ accommodates <dbl> 3, 2, 1, 2, 3, 1, 1, 2, …
## $ bathrooms <dbl> 1.0, 1.0, 1.0, NA, 1.0, …
## $ bedrooms <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ beds <dbl> 1, 1, 1, NA, 2, 1, NA, 1…
## $ bed_type <chr> "Real Bed", "Real Bed", …
## $ amenities <chr> "{TV,Wifi,Kitchen,Breakf…
## $ square_feet <dbl> 0, NA, NA, NA, NA, NA, N…
## $ price <dbl> 343, 768, 473, 514, 514,…
## $ weekly_price <chr> "$2,297.00", "$1,108.00"…
## $ monthly_price <chr> NA, "$3,077.00", NA, NA,…
## $ security_deposit <chr> NA, NA, NA, NA, "$686.00…
## $ cleaning_fee <dbl> 0, 154, 0, 0, 309, 69, 0…
## $ guests_included <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ extra_people <dbl> 274, 0, 0, 0, 0, 0, 0, 1…
## $ minimum_nights <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ maximum_nights <dbl> 15, 2, 730, 730, 3, 30, …
## $ minimum_minimum_nights <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ maximum_minimum_nights <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ minimum_maximum_nights <dbl> 15, 2, 730, 730, 3, 30, …
## $ maximum_maximum_nights <dbl> 15, 2, 730, 730, 3, 30, …
## $ minimum_nights_avg_ntm <dbl> 3, 1, 1, 1, 3, 3, 1, 2, …
## $ maximum_nights_avg_ntm <dbl> 15, 2, 730, 730, 3, 30, …
## $ availability_30 <dbl> 29, 30, 30, 0, 25, 30, 3…
## $ availability_60 <dbl> 59, 60, 60, 0, 55, 60, 6…
## $ availability_90 <dbl> 89, 90, 90, 0, 85, 90, 9…
## $ availability_365 <dbl> 364, 365, 365, 0, 360, 3…
## $ number_of_reviews <dbl> 0, 1, 0, 0, 9, 0, 0, 0, …
## $ number_of_reviews_ltm <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ first_review <date> NA, 2010-06-14, NA, NA,…
## $ last_review <date> NA, 2010-06-14, NA, NA,…
## $ review_scores_rating <dbl> NA, 80, NA, NA, 93, NA, …
## $ review_scores_accuracy <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_cleanliness <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_checkin <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_communication <dbl> NA, NA, NA, NA, 9, NA, N…
## $ review_scores_location <dbl> NA, NA, NA, NA, 8, NA, N…
## $ review_scores_value <dbl> NA, NA, NA, NA, 8, NA, N…
## $ requires_license <lgl> FALSE, FALSE, FALSE, FAL…
## $ license <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable <lgl> FALSE, TRUE, FALSE, FALS…
## $ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy <chr> "strict_14_with_grace_pe…
## $ require_guest_profile_picture <lgl> FALSE, TRUE, FALSE, FALS…
## $ require_guest_phone_verification <lgl> FALSE, FALSE, FALSE, FAL…
## $ calculated_host_listings_count <dbl> 1, 1, 1, 1, 6, 1, 1, 1, …
## $ calculated_host_listings_count_entire_homes <dbl> 0, 0, 0, 0, 4, 0, 0, 1, …
## $ calculated_host_listings_count_private_rooms <dbl> 1, 1, 1, 1, 1, 1, 1, 0, …
## $ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month <dbl> NA, 0.01, NA, NA, 0.08, …
## $ prop_type_simplified <chr> "Apartment", "Apartment"…
## $ cost <dbl> 2468, 3226, 1892, 2056, …
## $ price_4_nights <dbl> 2468, 3226, 1892, 2056, …
## $ summary_available <dbl> 0, 0, 0, 0, 0, 1, 0, 0, …
## $ space_available <dbl> 1, 1, 1, 1, 1, 0, 1, 1, …
## $ description_available <dbl> 1, 1, 1, 1, 1, 1, 1, 1, …
## $ notes_available <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ transit_available <dbl> 1, 0, 0, 0, 1, 0, 0, 0, …
## $ access_available <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
## $ interaction_available <dbl> 0, 0, 0, 0, 1, 0, 0, 0, …
After glancing at our new variables, we see that there is a very strong correlation between them but this should not be a problem when we only want to test for the significance of the exact location and use these variables to control for other factors.
# Exact-location model: is_location_exact plus the listing-completeness
# indicators as controls.
mymodelg <- lm(
  price_4_nights ~ is_location_exact + summary_available +
    description_available + notes_available + transit_available +
    access_available + interaction_available,
  data = listings_cleaned_g
)
msummary(mymodelg)## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2041.5 158.9 12.85 < 0.0000000000000002 ***
## is_location_exactTRUE 458.5 114.0 4.02 0.000058 ***
## summary_available 79.7 287.9 0.28 0.78193
## description_available 105.2 322.4 0.33 0.74431
## notes_available 91.9 160.7 0.57 0.56744
## transit_available -357.7 148.6 -2.41 0.01605 *
## access_available -79.3 156.8 -0.51 0.61321
## interaction_available -508.2 153.8 -3.30 0.00096 ***
##
## Residual standard error: 7860 on 20903 degrees of freedom
## Multiple R-squared: 0.00293, Adjusted R-squared: 0.0026
## F-statistic: 8.78 on 7 and 20903 DF, p-value: 0.0000000000813
We conclude that the availability of an exact location for the listing is a significant predictor of the price of our 4 night stay, since the t value is 4.02. Again, do note that this model only explains a very small part of the price, as we see that the Adjusted R-squared is only 0.0026.
For this question, we will start by grouping the neighbourhoods to divide them into 6 geographical areas: Center (C), North (N), East (E), Far East (FE), West (W), Far West (FW).
# Collapse neighbourhood_cleansed into six geographical areas via a lookup
# table. Neighbourhoods that are not listed below (and NA values) end up as NA
# in neighbourhood_simplified.
listings_cleaned_g <- local({
  region_members <- list(
    "Center"   = c("Beyoglu", "Sisli", "Besiktas", "Fatih", "Uskudar"),
    "North"    = c("Beykoz", "Eyup", "Arnavutkoy", "Kagithane", "Sariyer"),
    "East"     = c("Atasehir", "Kadikoy", "Maltepe", "Umraniye"),
    "Far East" = c("Kartal", "Sancaktepe", "Sile", "Cekmekoy", "Pendik",
                   "Sultanbeyli", "Tuzla"),
    "West"     = c("Bagcilar", "Bakirkoy", "Bayrampasa", "Esenler",
                   "Gaziosmanpasa", "Zeytinburnu", "Bahcelievler", "Gungoren"),
    "Far West" = c("Catalca", "Sultangazi", "Avcilar", "Basaksehir",
                   "Beylikduzu", "Buyukcekmece", "Esenyurt", "Kucukcekmece",
                   "Silivri")
  )

  # Named vector: names are neighbourhoods, values are their area.
  region_lookup <- setNames(
    rep(names(region_members), lengths(region_members)),
    unlist(region_members, use.names = FALSE)
  )

  listings_cleaned %>%
    mutate(neighbourhood_simplified = unname(region_lookup[neighbourhood_cleansed]))
})
Now, lets create a model that will show us whether the areas we just created are significant predictors of the price of the four night stay. In this model, the estimates will be calculated as the difference between the price in the center and the price in that region. e.g. if the estimate for West would be -500 it would mean the price is estimated to be 500$ lower in the West compared to the center of the city.
# Area model: 4-night price explained by the simplified neighbourhood area.
# NOTE(review): this reuses the name mymodelg, replacing the exact-location
# model fitted earlier under the same name.
mymodelg <- lm(
  price_4_nights ~ neighbourhood_simplified,
  data = listings_cleaned_g
)
msummary(mymodelg)## Estimate Std. Error t value
## (Intercept) 2255.7 67.3 33.54
## neighbourhood_simplifiedEast -1169.7 162.3 -7.21
## neighbourhood_simplifiedFar East -667.2 304.8 -2.19
## neighbourhood_simplifiedFar West -141.5 225.9 -0.63
## neighbourhood_simplifiedNorth -130.2 233.8 -0.56
## neighbourhood_simplifiedWest -718.0 275.1 -2.61
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## neighbourhood_simplifiedEast 0.00000000000059 ***
## neighbourhood_simplifiedFar East 0.0286 *
## neighbourhood_simplifiedFar West 0.5309
## neighbourhood_simplifiedNorth 0.5776
## neighbourhood_simplifiedWest 0.0091 **
##
## Residual standard error: 7890 on 20758 degrees of freedom
## (147 observations deleted due to missingness)
## Multiple R-squared: 0.00279, Adjusted R-squared: 0.00255
## F-statistic: 11.6 on 5 and 20758 DF, p-value: 0.0000000000317
We observe a p-value for our model of 3.17e-11. This means that the area to which a property belongs is indeed a significant predictor of the price of our four night stay. As we suspected, the properties in the center of the city are the most expensive for us to stay at for four nights, although the differences with the Far West and the North are not statistically significant.
For this question, we will investigate whether the cancellation policy has an effect on the price of our four night stay. Using a similar strategy as for the previous questions, we first looked at the available variables.
# Model I: cancellation policy, controlling for bedrooms, bathrooms and the
# number of guests accommodated.
mymodeli <- lm(
  price_4_nights ~ cancellation_policy + bedrooms + bathrooms + accommodates,
  data = listings_cleaned
)
summary(mymodeli) ##
## Call:
## lm(formula = price_4_nights ~ cancellation_policy + bedrooms +
## bathrooms + accommodates, data = listings_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16835 -1124 -693 -114 306202
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 418.0 105.6 3.96
## cancellation_policymoderate -307.0 145.9 -2.10
## cancellation_policystrict -342.3 7840.4 -0.04
## cancellation_policystrict_14_with_grace_period 425.2 147.7 2.88
## cancellation_policysuper_strict_30 1072.2 2176.0 0.49
## cancellation_policysuper_strict_60 818.2 7840.4 0.10
## bedrooms -135.8 55.6 -2.44
## bathrooms 334.9 72.2 4.64
## accommodates 434.3 28.8 15.11
## Pr(>|t|)
## (Intercept) 0.0000757 ***
## cancellation_policymoderate 0.035 *
## cancellation_policystrict 0.965
## cancellation_policystrict_14_with_grace_period 0.004 **
## cancellation_policysuper_strict_30 0.622
## cancellation_policysuper_strict_60 0.917
## bedrooms 0.015 *
## bathrooms 0.0000036 ***
## accommodates < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7840 on 20677 degrees of freedom
## (225 observations deleted due to missingness)
## Multiple R-squared: 0.0186, Adjusted R-squared: 0.0182
## F-statistic: 49 on 8 and 20677 DF, p-value: <0.0000000000000002
We see that cancellation_policy is indeed a significant explanatory variable for the price of a 4-night stay in Istanbul, but the direction of the effect depends on the policy. Each coefficient is measured relative to the reference category (the policy level that has no coefficient of its own). If a listing has a moderate cancellation policy, the price for four nights is lowered by $307
If a listing has a strict policy with a 14 days grace period, the listing price is raised by $425. The coefficients for the strict, super_strict_30 and super_strict_60 policies are not statistically significant.
| GVIF | Df | GVIF^(1/(2*Df)) | |
|---|---|---|---|
| number_of_reviews | 1.014 | 1 | 1.007 |
| prop_type_simplified | 1.025 | 4 | 1.003 |
| review_scores_rating | 1.023 | 1 | 1.012 |
Model 1 looks OK regarding VIF-scores.
| GVIF | Df | GVIF^(1/(2*Df)) | |
|---|---|---|---|
| number_of_reviews | 1.039 | 1 | 1.020 |
| room_type | 1.401 | 3 | 1.058 |
| prop_type_simplified | 1.390 | 4 | 1.042 |
| review_scores_rating | 1.026 | 1 | 1.013 |
Model 2 looks OK regarding VIF-scores.
Only one explanatory variable for these models hence no need to check for multicollinearity.
| x | |
|---|---|
| bathrooms | 2.065 |
| bedrooms | 2.431 |
| beds | 1.884 |
| accommodates | 1.685 |
Again a rather low VIF-score across the board.
VIF throws an error here saying there’s at least one aliased coefficient in the model, meaning that some of the explanatory variables are perfectly collinear. Let’s find out which ones those are:
## Model :
## price ~ host_is_superhost + host_total_listings_count + host_has_profile_pic +
## host_identity_verified + number_of_verifications
##
## Complete :
## (Intercept) host_is_superhostTRUE
## number_of_verifications 20911 0
## host_total_listings_count host_has_profile_picTRUE
## number_of_verifications 0 0
## host_identity_verifiedTRUE
## number_of_verifications 0
It seems that number_of_verifications is our culprit, let’s run the regression without it.
# Refit the host model without number_of_verifications, then tabulate the
# variance inflation factors of the remaining predictors.
modelf_adjusted <- lm(
  price ~ host_is_superhost + host_total_listings_count +
    host_has_profile_pic + host_identity_verified,
  data = listings_cleaned_f
)
kable_styling(kable(vif(modelf_adjusted)))
| x | |
|---|---|
| host_is_superhost | 1.021 |
| host_total_listings_count | 1.002 |
| host_has_profile_pic | 1.001 |
| host_identity_verified | 1.021 |
This seems to have solved the problem; now let’s see if our regression tells us something novel.
##
## Call:
## lm(formula = price ~ host_is_superhost + host_total_listings_count +
## host_has_profile_pic + host_identity_verified, data = listings_cleaned_f)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2884 -300 -195 -17 76492
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 700.1891 203.9621 3.43 0.0006 ***
## host_is_superhostTRUE 95.6749 43.4600 2.20 0.0277 *
## host_total_listings_count 1.8947 0.0556 34.11 <0.0000000000000002 ***
## host_has_profile_picTRUE -274.1047 204.4635 -1.34 0.1801
## host_identity_verifiedTRUE -48.6547 36.6238 -1.33 0.1840
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1910 on 20905 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0531, Adjusted R-squared: 0.0529
## F-statistic: 293 on 4 and 20905 DF, p-value: <0.0000000000000002
We still find that there is indeed a price premium if a host is a super-host. Furthermore, we also still see that the total listings count has an impact on the price (premium). Adjusted R-squared is still 0.0529, so removing the variable did not improve our model.
Fewer than 2 terms so no use in VIF
# Side-by-side comparison of all fitted models.
# The huxtable returned by huxreg() is printed directly: piping it through
# kable() / kable_styling() converts it to a plain data frame, which discards
# huxreg's number formatting and adds stray header and row-name columns.
# NOTE(review): Model F is fitted on `price` while the other models use
# `price_4_nights`, so its fit statistics are not directly comparable.
huxreg(
  list(
    "Model 1" = model1,
    "Model 2" = model2,
    "Model A" = mymodela,
    "Model B" = mymodelb,
    "Model C" = mymodelc,
    "Model D" = mymodeld,
    "Model E" = mymodele,
    "Model F" = modelf_adjusted,
    "Model G" = mymodelg,
    "Model I" = mymodeli
  ),
  statistics = c(
    "#observations" = "nobs",
    "R squared" = "r.squared",
    "Adj. R Squared" = "adj.r.squared",
    "Residual SE" = "sigma"
  ),
  bold_signif = 0.05
)
| names | Model 1 | Model 2 | Model A | Model B | Model C | Model D | Model E | Model F | Model G | Model I | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Model 1 | Model 2 | Model A | Model B | Model C | Model D | Model E | Model F | Model G | Model I | ||
| 1 | (Intercept) | 2665.80373722167 *** | 2863.31649679719 *** | 1373.56304760703 *** | 1543.52597283389 *** | 1338.84199596149 *** | 570.979500460525 *** | 409.195756322163 *** | 700.189143881403 *** | 2255.7496910215 *** | 417.974176232004 *** |
| 2 | (493.831198874327) | (495.276529084339) | (81.8730132989876) | (74.3094507813971) | (74.6520376499834) | (93.877103151643) | (99.5605282824683) | (203.962120603776) | (67.2501312358082) | (105.591931170869) | |
| 3 | number_of_reviews | -0.74365691852765 | -3.00811449064875 | ||||||||
| 4 | (2.23829568286529) | (2.2615722733781) | |||||||||
| 5 | prop_type_simplifiedBoutique hotel | 452.911525981365 | 381.814989939969 | ||||||||
| 6 | (327.261470219716) | (366.797886639531) | |||||||||
| 7 | prop_type_simplifiedHouse | -21.0310807257259 | 48.5474398256521 | ||||||||
| 8 | (322.411611960802) | (321.977894288409) | |||||||||
| 9 | prop_type_simplifiedOther | 246.776206573285 | 276.954622437153 | ||||||||
| 10 | (208.226775099041) | (216.603865928541) | |||||||||
| 11 | prop_type_simplifiedServiced apartment | 439.449031129747 | 173.57496327442 | ||||||||
| 12 | (280.230585672306) | (285.190623768192) | |||||||||
| 13 | review_scores_rating | -10.7719725200272 * | -9.33729991559724 | ||||||||
| 14 | (5.26067880215465) | (5.25629789585014) | |||||||||
| 15 | room_typeHotel room | 782.088517178091 * | |||||||||
| 16 | (374.010970820755) | ||||||||||
| 17 | room_typePrivate room | -879.566661877273 *** | |||||||||
| 18 | (165.55993892689) | ||||||||||
| 19 | room_typeShared room | -1460.05389137127 * | |||||||||
| 20 | (638.071183545976) | ||||||||||
| 21 | bathrooms | 545.579120408411 *** | 338.559250180018 *** | 334.946276051673 *** | |||||||
| 22 | (51.0682019480521) | (71.0075140053) | (72.2415497478511) | ||||||||
| 23 | bedrooms | 360.15800509051 *** | -120.357557938856 * | -135.765654410497 * | |||||||
| 24 | (36.9384530066663) | (55.7965854005025) | (55.5629207170478) | ||||||||
| 25 | beds | 315.063166987962 *** | 8.27821994414329 | ||||||||
| 26 | (25.8195537938598) | (35.4107986206482) | |||||||||
| 27 | accommodates | 460.635680255991 *** | 412.016941287965 *** | 434.304439344051 *** | |||||||
| 28 | (24.3100997387693) | (31.0448438353153) | (28.7518162714076) | ||||||||
| 29 | host_is_superhostTRUE | 95.6748925909414 * | |||||||||
| 30 | (43.4600415702343) | ||||||||||
| 31 | host_total_listings_count | 1.89471350281978 *** | |||||||||
| 32 | (0.0555547777010792) | ||||||||||
| 33 | host_has_profile_picTRUE | -274.104682752846 | |||||||||
| 34 | (204.463456240145) | ||||||||||
| 35 | host_identity_verifiedTRUE | -48.6547439728043 | |||||||||
| 36 | (36.6238421121075) | ||||||||||
| 37 | neighbourhood_simplifiedEast | -1169.74232776797 *** | |||||||||
| 38 | (162.279614900289) | ||||||||||
| 39 | neighbourhood_simplifiedFar East | -667.214179657748 * | |||||||||
| 40 | (304.77261085548) | ||||||||||
| 41 | neighbourhood_simplifiedFar West | -141.533697000572 | |||||||||
| 42 | (225.867042623846) | ||||||||||
| 43 | neighbourhood_simplifiedNorth | -130.188852987599 | |||||||||
| 44 | (233.773328969641) | ||||||||||
| 45 | neighbourhood_simplifiedWest | -718.028867222633 ** | |||||||||
| 46 | (275.134302616739) | ||||||||||
| 47 | cancellation_policymoderate | -306.981032481855 * | |||||||||
| 48 | (145.905660066239) | ||||||||||
| 49 | cancellation_policystrict | -342.290538535808 | |||||||||
| 50 | (7840.43795726366) | ||||||||||
| 51 | cancellation_policystrict_14_with_grace_period | 425.158931840075 ** | |||||||||
| 52 | (147.715379644877) | ||||||||||
| 53 | cancellation_policysuper_strict_30 | 1072.203229539 | |||||||||
| 54 | (2175.98854205583) | ||||||||||
| 55 | cancellation_policysuper_strict_60 | 818.236323438462 | |||||||||
| 56 | (7840.35438340087) | ||||||||||
| 1.1 | #observations | 9639 | 9639 | 20831 | 20755 | 20261 | 20911 | 20114 | 20910 | 20764 | 20686 |
| 2.1 | R squared | 0.00110229510886997 | 0.00566526329785218 | 0.0054497010682834 | 0.00455998011420604 | 0.00729624591033085 | 0.0168816238634554 | 0.0176374108553016 | 0.0530675565870543 | 0.00278923205362719 | 0.0186233842383247 |
| 3.1 | Adj. R Squared | 0.000480058166454245 | 0.00473588198823349 | 0.00540195272227861 | 0.00451201403605406 | 0.00724724528077902 | 0.0168346049540797 | 0.0174420033086021 | 0.0528863688437561 | 0.00254903290921371 | 0.0182436863650309 |
| 4.1 | Residual SE | 7186.16431911236 | 7170.84913021399 | 7864.41963946265 | 7881.82196146728 | 7596.76614539442 | 7804.48253368504 | 7584.38521536803 | 1913.23437804512 | 7887.21053226126 | 7840.0267645568 |
| .1 | *** p < 0.001; ** p < 0.01; * p < 0.05. |
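Looking at our huxtable, we see that Model F seems to have the best explanatory power for staying at an AirBnB for four nights in Istanbul. Note, however, that Model F is fitted on price rather than price_4_nights (see its Call output above), so its R-squared and residual standard error are not directly comparable with those of the other models. It’s also important to note that all of our models have a very poor prediction power…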
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 700.189 | 203.9621 | 3.433 | 0.0006 |
| host_is_superhostTRUE | 95.675 | 43.4600 | 2.201 | 0.0277 |
| host_total_listings_count | 1.895 | 0.0556 | 34.105 | 0.0000 |
| host_has_profile_picTRUE | -274.105 | 204.4635 | -1.341 | 0.1801 |
| host_identity_verifiedTRUE | -48.655 | 36.6238 | -1.329 | 0.1840 |
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.0531 | 0.0529 | 1913 | 292.9 | 0 | 4 | -187675 | 375362 | 375410 | 76522037242 | 20905 | 20910 |
That is:
Model: 700.189+95.675(host_is_superhostTRUE)+1.895(host_total_listings_count)-274.105(host_has_profile_picTRUE)-48.655(host_identity_verifiedTRUE)
Following the filter criteria set forth in the problem, we predict the cost:
# Predicted cost per listing for the target stay: apartment, private room,
# at least 10 reviews and a review score above 90.
#
# The prediction comes straight from the fitted model instead of re-typing its
# coefficients. The hand-typed version used 1.84 and -478.568 where the model
# reports 1.895 and -274.105, and it applied count() to
# host_total_listings_count instead of multiplying the listing's actual value
# by the coefficient.
# NOTE(review): modelf_adjusted is fitted on `price`; confirm this is the
# intended response for a 4-night cost. The figures quoted in the text below
# come from the earlier run and need refreshing when the report is re-knit.
prediction_candidates <- listings_cleaned %>%
  filter(
    prop_type_simplified == "Apartment",
    room_type == "Private room",
    number_of_reviews >= 10,
    review_scores_rating > 90
  )

final_prediction <- prediction_candidates %>%
  transmute(
    listing_url,
    predicted_cost = unname(
      predict(modelf_adjusted, newdata = prediction_candidates)
    )
  ) %>%
  arrange(listing_url)
We see that the total cost of staying at an AirBnB in Istanbul for four days fitting the criteria set forth above varies from: * Lowest price: 377.4 (found using slice()) * Highest price: 523.7
From our regression model we know that there is a standard error for every explanatory variable. Let’s find the 95% confidence interval for the above prices:
# Lower and upper 95% confidence bounds of the predicted cost for the same
# filtered listings.
#
# The bounds come from predict(), which accounts for the joint uncertainty of
# all coefficients. The earlier approach shifted each coefficient by one
# standard error (and left two of them unshifted in the lower bound), which is
# not a 95% interval; it also applied count() to host_total_listings_count
# instead of using the listing's actual value.
# For the range of an individual listing's price rather than of the mean
# prediction, use interval = "prediction".
# NOTE(review): figures quoted in the surrounding text come from the earlier
# run and need refreshing when the report is re-knit.
ci_candidates <- listings_cleaned %>%
  filter(
    prop_type_simplified == "Apartment",
    room_type == "Private room",
    number_of_reviews >= 10,
    review_scores_rating > 90
  )

# Matrix with columns "fit", "lwr" and "upr", one row per candidate listing.
ci_bounds <- predict(
  modelf_adjusted,
  newdata = ci_candidates,
  interval = "confidence",
  level = 0.95
)

LOWER_final_prediction <- ci_candidates %>%
  transmute(
    listing_url,
    LOWER_predicted_cost = unname(ci_bounds[, "lwr"])
  ) %>%
  arrange(listing_url)

HIGHER_final_prediction <- ci_candidates %>%
  transmute(
    listing_url,
    predicted_cost = unname(ci_bounds[, "upr"])
  ) %>%
  arrange(listing_url)
To conclude, we therefore see that:
For our minimum and maximum price we see that the confidence interval set gives us: * minimum price with 95% confidence = 136.8 to 822.5 * maximum price with 95% confidence = 276.2 to 975.6
The wide price range tells us that our model is poor at predicting the price. However, this model F had the highest R-squared value out of the possible models.